Web scraping MyAnimeList.net

This project scrapes reviews and metadata for 5,953 anime from MyAnimeList.net. Contents: 1. web scraping 2. reviews: EDA 3. anime info: network graph EDA 4. predict anime score

  1. Web scraping: done with Scrapy. Two parts of MyAnimeList.net were scraped: anime reviews and anime info. In total, 89,244 reviews and the info pages of 5,953 anime were downloaded. Lesson learned:
  1. reviews EDA
library(ggplot2)
library(dplyr)
AnimeReviews <- readRDS("AnimeReviews/AnimeReviews.RDS")
summary(AnimeReviews)
##  anime_title            rating        anime_url         review_text       
##  Length:89244       Min.   : 0.000   Length:89244       Length:89244      
##  Class :character   1st Qu.: 6.000   Class :character   Class :character  
##  Mode  :character   Median : 8.000   Mode  :character   Mode  :character  
##                     Mean   : 7.545                                        
##                     3rd Qu.: 9.000                                        
##                     Max.   :10.000                                        
##  review_time          reviewer           pic_url         
##  Length:89244       Length:89244       Length:89244      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##       date                    
##  Min.   :2006-11-07 00:00:00  
##  1st Qu.:2012-01-06 00:00:00  
##  Median :2014-04-01 00:00:00  
##  Mean   :2013-08-25 01:41:54  
##  3rd Qu.:2015-10-04 00:00:00  
##  Max.   :2016-11-07 00:00:00
#head(AnimeReviews)
# Number of reviews per anime, most-reviewed titles first.
# Grouping by title and counting rows yields one row per anime with the
# columns `anime_title` and `Count`.
reviews_by_title <- group_by(AnimeReviews, anime_title)
review_counts <- summarise(reviews_by_title, Count = n())
summ_review <- arrange(review_counts, desc(Count))
summary(summ_review$Count)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.00    3.00   15.02   11.00 1112.00
head(summ_review, 3)
## # A tibble: 3 × 2
##        anime_title Count
##              <chr> <int>
## 1 Sword Art Online  1112
## 2     Angel Beats!   731
## 3       Death Note   656
# Frequency polygon of review dates over all anime, using 60 bins.
p1 <- ggplot(AnimeReviews, aes(x = date)) +
    geom_freqpoly(bins = 60) +
    labs(x = "Review Date", y = "Number of Reviews") +
    theme_bw()
p1

# Same frequency polygon, restricted to the reviews of a single title.
is_sao <- AnimeReviews$anime_title == "Sword Art Online"
review1 <- AnimeReviews[is_sao, ]
p2 <- ggplot(review1, aes(x = date)) +
    geom_freqpoly(bins = 30) +
    labs(x = "Review Date", y = "Number of Reviews",
         title = "Reviews of 'Sword Art Online'") +
    theme_bw()
p2

  1. anime info network graph
library(visNetwork)
library(car)
AnimeInfo <- readRDS("AnimeInfo/AnimeInfo.RDS")
length(AnimeInfo)
## [1] 5953
names(AnimeInfo[[1]])
##  [1] "anime_title"      "anime_status"     "anime_popularity"
##  [4] "anime_studios"    "anime_genres"     "anime_favorites" 
##  [7] "anime_synopsis"   "anime_aired"      "anime_premiered" 
## [10] "anime_related"    "anime_mainactors" "anime_producers" 
## [13] "anime_score"      "anime_ranked"     "anime_staff"     
## [16] "anime_rating"     "anime_background" "anime_episodes"  
## [19] "anime_type"       "anime_members"
# Fix the RNG seed so the same subset is drawn on every run.
set.seed(0)
# Randomly choose 100 anime for the network graph: draw 100 positions without
# replacement and subset the list with them.
graph_list <- AnimeInfo[sample.int(length(AnimeInfo), 100)]
# Build the edge list for the anime network graph.
#
# Two anime are linked when they share at least one main cast member or one
# staff member (staff names are compared regardless of role).
#
# Args:
#   lst: list of anime records; each record is a list that may contain
#        `anime_mainactors` and `anime_staff`.
#
# Returns:
#   data.frame with one row per linked pair (i < j) and the columns
#     from  - position of the first anime in `lst`
#     to    - position of the second anime in `lst`
#     value - number of shared cast + staff names
#     title - HTML label listing the shared names, separated by "<br>"
#   A zero-row data.frame with the same columns when nothing is shared or
#   when `lst` has fewer than two elements.
mklink_lst <- function(lst) {
    empty_links <- data.frame(from = numeric(), to = numeric(),
                              value = numeric(), title = character(),
                              stringsAsFactors = FALSE)
    n <- length(lst)
    if (n < 2) {
        return(empty_links)
    }

    # Flatten the name collections once per anime instead of once per pair.
    actor_lst <- lapply(lst, function(x) unlist(x$anime_mainactors))
    staff_lst <- lapply(lst, function(x) unlist(x$anime_staff))

    # Collect rows in a list and bind once at the end; calling rbind() inside
    # the loop copies the whole data.frame on every append.
    rows <- list()

    # NOTE: `1:l-1` parses as `(1:l) - 1` and `i+1:l` as `i + (1:l)`, so the
    # ranges are spelled out explicitly to visit each unordered pair once
    # without indexing position 0 or past the end of the list.
    for (i in seq_len(n - 1)) {
        for (j in (i + 1):n) {
            shared_actor <- intersect(actor_lst[[i]], actor_lst[[j]])
            shared_staff <- intersect(staff_lst[[i]], staff_lst[[j]])
            if (length(shared_actor) == 0 && length(shared_staff) == 0) {
                next
            }

            # Staff entries come first in the label, then cast entries.
            labels <- character()
            if (length(shared_staff) > 0) {
                labels <- c(labels,
                            paste0("Staff: ", shared_staff, collapse = "<br>"))
            }
            if (length(shared_actor) > 0) {
                labels <- c(labels,
                            paste0("Cast: ", shared_actor, collapse = "<br>"))
            }

            rows[[length(rows) + 1L]] <- data.frame(
                from = i, to = j,
                value = length(shared_staff) + length(shared_actor),
                title = paste(labels, collapse = "<br>"),
                stringsAsFactors = FALSE
            )
        }
    }

    if (length(rows) == 0) {
        return(empty_links)
    }
    do.call(rbind, rows)
}

# Build the node table for the anime network graph.
#
# Args:
#   lst: list of anime records; each record is a list with the fields
#        `anime_title`, `anime_type` and `anime_members` (member count as
#        text, possibly with thousands separators such as "1,296").
#
# Returns:
#   data.frame with one row per element of `lst` and the columns
#     id    - position of the anime in `lst`
#     title - anime title
#     group - anime type
#     size  - 2 * fourth root of the member count
#     label - always NA
#   A missing field yields NA in that row instead of shifting the column;
#   an empty `lst` yields a zero-row data.frame with the same columns.
mknode_lst <- function(lst) {
    # Exactly one string per record, so every column has length(lst) entries.
    field_chr <- function(name) {
        vapply(lst, function(x) {
            value <- x[[name]]
            if (length(value) == 0) NA_character_ else as.character(value[[1]])
        }, character(1), USE.NAMES = FALSE)
    }

    # Strip thousands separators before converting the counts to numbers.
    members <- as.numeric(gsub(",", "", field_chr("anime_members"),
                               fixed = TRUE))

    node_list <- data.frame(
        id = seq_along(lst),
        title = field_chr("anime_title"),
        group = field_chr("anime_type"),
        # Fourth root compresses the wide range of member counts.
        size = 2 * sqrt(sqrt(members)),
        stringsAsFactors = FALSE
    )
    node_list$label <- rep(NA, nrow(node_list))
    node_list
}
# Edge and node tables for the sampled anime.
link_list <- mklink_lst(graph_list)
node_list <- mknode_lst(graph_list)

# Assemble the interactive network step by step: base graph, node options,
# a legend driven by the node groups, and a group selector with
# nearest-neighbour highlighting.
graph <- visNetwork(nodes = node_list, edges = link_list, main = "Anime")
graph <- visNodes(graph, label = NULL)
graph <- visLegend(graph, enabled = TRUE, useGroups = TRUE, addNodes = NULL,
                   addEdges = NULL, width = 0.2, position = "left",
                   main = NULL)
graph <- visOptions(graph, selectedBy = "group", highlightNearest = TRUE)
graph
  1. predict anime score
# Flatten the scraped anime records into one data.frame, one row per anime.

# Return the k-th entry of `values` as a single string, or NA when `values`
# is NULL, too short, or holds an empty entry at that position.
nth_or_na <- function(values, k = 1L) {
    if (length(values) < k) {
        return(NA_character_)
    }
    picked <- values[[k]]
    if (length(picked) == 0) NA_character_ else as.character(picked[[1]])
}

# Apply `extract` to every anime record and keep exactly one string per
# record, so all resulting columns have length(AnimeInfo) entries.
anime_chr <- function(extract, k = 1L) {
    vapply(AnimeInfo, function(x) nth_or_na(extract(x), k),
           character(1), USE.NAMES = FALSE)
}

# Convert count strings such as "1,234" to numbers.
as_count <- function(x) as.numeric(gsub(",", "", x, fixed = TRUE))

Title <- anime_chr(function(x) x$anime_title)
Type <- anime_chr(function(x) x$anime_type)
Rating <- anime_chr(function(x) x$anime_rating)
Viewer <- as_count(anime_chr(function(x) x$anime_members))
Score <- as.numeric(anime_chr(function(x) x$anime_score))
Favorite <- as_count(anime_chr(function(x) x$anime_favorites))

# First four main cast members; NA where an anime lists fewer.
# The field is spelled out in full (`anime_mainactors`) rather than relying
# on partial matching of `$`.
Actor1 <- anime_chr(function(x) x$anime_mainactors, 1L)
Actor2 <- anime_chr(function(x) x$anime_mainactors, 2L)
Actor3 <- anime_chr(function(x) x$anime_mainactors, 3L)
Actor4 <- anime_chr(function(x) x$anime_mainactors, 4L)

# Staff roles; NA where the role is not listed. If a role holds several
# names, only the first is kept so the column stays aligned with the rows.
Director <- anime_chr(function(x) x$anime_staff$Director)
Musician <- anime_chr(function(x) x$anime_staff$Music)

AnimeInfo_df <- data.frame(Title = Title, Type = Type, Rating = Rating,
                           Director = Director, Actor1 = Actor1,
                           Actor2 = Actor2, Actor3 = Actor3,
                           Actor4 = Actor4, Musician = Musician,
                           Score = Score,
                           Viewer = Viewer, Favorite = Favorite
                           )
table(AnimeInfo_df$Rating)
## 
##                   G - All Ages                           None 
##                            686                             31 
##                  PG - Children      PG-13 - Teens 13 or older 
##                            351                           2815 
## R - 17+ (violence & profanity)               R+ - Mild Nudity 
##                            722                            628 
##                    Rx - Hentai 
##                            720
# Upon investigation, every anime with a missing rating ("None") is an old
# title produced before the rating system existed, and they all look like
# "G"-rated anime, so they are recoded to "G - All Ages".
# The recode is done on a character copy; converting back to a factor
# afterwards leaves no "None" level behind.
rating_chr <- as.character(AnimeInfo_df$Rating)
rating_chr[rating_chr == "None"] <- "G - All Ages"
AnimeInfo_df$Rating <- as.factor(rating_chr)

# Dummy-code the director variable.
# Directors who won awards in the last 15-20 years at Tokyo Anime Award or
# Animation Kobe. The list is transcribed per award, so names repeat;
# unique() collapses the repeats.
direct_award <- unique(c(
    "Daichi, Akitaro", "Miyazaki, Hayao", "Hara, Keiichi", "Kon, Satoshi",
    "Miyazaki, Hayao", "Tomino, Yoshiyuki", "Hosoda, Mamoru", "Anno, Hideaki",
    "Miyazaki, Hayao", "Hosoda, Mamoru", "Yonebayashi, Hiromasa", "Shinbo, Akiyuki",
    "Hosoda, Mamoru", "Araki, Tetsuro", "Takahata, Isao", "Fujita, Yōichi",
    "Anno, Hideaki", "Miyazaki, Hayao", "Watanabe, Shinichi", "Daichi, Akitaro",
    "Okiura, Hiroyuki", "Kitakubo, Hiroyuki", "Hara, Keiichi", "Kuroda, Yosuke",
    "Kamiyama, Kenji", "Yoshida, Kenichi", "Nagahama, Hiroshi", "Imaishi, Hiroyuki",
    "Iso, Mitsuo", "Kato, Kunio", "Hosoda, Mamoru", "Okada, Mari",
    "Agematsu, Noriyasu", "Mizushima, Tsutomu", "Kishi, Seiji", "Mizushima, Seiji"
))
# TRUE when the anime's director is on the award list (NA directors -> FALSE).
AnimeInfo_df$Award_Director <- AnimeInfo_df$Director %in% direct_award
## There was not enough time to build the same award indicator for actors and
## musicians, so only the director indicator enters the model.
## Multi-variable linear regression of Score on all available predictors.
## glm() is called without a `family` argument, so it fits the default
## gaussian family (confirmed by the dispersion line in the summary output).
model <- glm(Score ~ Award_Director + Viewer + Favorite + Type + Rating, data = AnimeInfo_df)
summary(model)
## 
## Call:
## glm(formula = Score ~ Award_Director + Viewer + Favorite + Type + 
##     Rating, data = AnimeInfo_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.7681  -0.3781   0.0714   0.4819   3.2648  
## 
## Coefficients:
##                                        Estimate Std. Error t value
## (Intercept)                           6.735e+00  3.486e-02 193.173
## Award_DirectorTRUE                    3.733e-01  5.212e-02   7.163
## Viewer                                5.395e-06  2.327e-07  23.182
## Favorite                             -3.646e-05  5.283e-06  -6.901
## TypeMusic                            -5.566e-01  8.036e-02  -6.926
## TypeONA                              -7.764e-01  5.816e-02 -13.349
## TypeOVA                              -2.281e-01  3.493e-02  -6.530
## TypeSpecial                           5.017e-03  3.966e-02   0.127
## TypeTV                                1.777e-02  3.040e-02   0.584
## RatingPG - Children                  -8.318e-03  4.906e-02  -0.170
## RatingPG-13 - Teens 13 or older       2.321e-01  3.262e-02   7.115
## RatingR - 17+ (violence & profanity)  1.474e-01  4.143e-02   3.559
## RatingR+ - Mild Nudity               -1.865e-01  4.274e-02  -4.364
## RatingRx - Hentai                    -1.337e-01  4.632e-02  -2.886
##                                      Pr(>|t|)    
## (Intercept)                           < 2e-16 ***
## Award_DirectorTRUE                   8.82e-13 ***
## Viewer                                < 2e-16 ***
## Favorite                             5.72e-12 ***
## TypeMusic                            4.80e-12 ***
## TypeONA                               < 2e-16 ***
## TypeOVA                              7.12e-11 ***
## TypeSpecial                          0.899335    
## TypeTV                               0.558947    
## RatingPG - Children                  0.865357    
## RatingPG-13 - Teens 13 or older      1.25e-12 ***
## RatingR - 17+ (violence & profanity) 0.000375 ***
## RatingR+ - Mild Nudity               1.30e-05 ***
## RatingRx - Hentai                    0.003916 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.5564911)
## 
##     Null deviance: 4651.4  on 5952  degrees of freedom
## Residual deviance: 3305.0  on 5939  degrees of freedom
## AIC: 13421
## 
## Number of Fisher Scoring iterations: 2
plot(model)

avPlots(model)

vif(model)
##                    GVIF Df GVIF^(1/(2*Df))
## Award_Director 1.025117  1        1.012481
## Viewer         3.089501  1        1.757698
## Favorite       2.654953  1        1.629403
## Type           1.853266  5        1.063638
## Rating         1.751659  5        1.057657
## Reduced model: same predictors as `model` but without Viewer.
## NOTE(review): presumably motivated by the VIF output above, where Viewer and
## Favorite have the largest values -- confirm the intent.
model_2 <- glm(Score ~ Award_Director + Favorite + Type + Rating, data = AnimeInfo_df)
summary(model_2)
## 
## Call:
## glm(formula = Score ~ Award_Director + Favorite + Type + Rating, 
##     data = AnimeInfo_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.7822  -0.4149   0.0797   0.5036   3.2468  
## 
## Coefficients:
##                                        Estimate Std. Error t value
## (Intercept)                           6.753e+00  3.640e-02 185.553
## Award_DirectorTRUE                    4.748e-01  5.423e-02   8.755
## Favorite                              5.871e-05  3.473e-06  16.904
## TypeMusic                            -6.097e-01  8.388e-02  -7.269
## TypeONA                              -8.341e-01  6.067e-02 -13.747
## TypeOVA                              -3.166e-01  3.625e-02  -8.732
## TypeSpecial                          -4.088e-02  4.136e-02  -0.988
## TypeTV                                1.211e-01  3.140e-02   3.858
## RatingPG - Children                  -6.464e-03  5.122e-02  -0.126
## RatingPG-13 - Teens 13 or older       3.537e-01  3.362e-02  10.519
## RatingR - 17+ (violence & profanity)  3.558e-01  4.223e-02   8.425
## RatingR+ - Mild Nudity               -2.208e-02  4.400e-02  -0.502
## RatingRx - Hentai                    -4.841e-02  4.821e-02  -1.004
##                                      Pr(>|t|)    
## (Intercept)                           < 2e-16 ***
## Award_DirectorTRUE                    < 2e-16 ***
## Favorite                              < 2e-16 ***
## TypeMusic                             4.1e-13 ***
## TypeONA                               < 2e-16 ***
## TypeOVA                               < 2e-16 ***
## TypeSpecial                          0.322989    
## TypeTV                               0.000116 ***
## RatingPG - Children                  0.899588    
## RatingPG-13 - Teens 13 or older       < 2e-16 ***
## RatingR - 17+ (violence & profanity)  < 2e-16 ***
## RatingR+ - Mild Nudity               0.615886    
## RatingRx - Hentai                    0.315376    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.6067446)
## 
##     Null deviance: 4651.4  on 5952  degrees of freedom
## Residual deviance: 3604.1  on 5940  degrees of freedom
## AIC: 13934
## 
## Number of Fisher Scoring iterations: 2
plot(model_2)

avPlots(model_2)

vif(model_2)
##                    GVIF Df GVIF^(1/(2*Df))
## Award_Director 1.017893  1        1.008907
## Favorite       1.052151  1        1.025744
## Type           1.672155  5        1.052756
## Rating         1.634392  5        1.050354
## Manual nested-model comparison: difference in residual deviance between the
## reduced model (model_2) and the full model (model), referred to a
## chi-square distribution with df equal to the difference in residual df.
## NOTE(review): the deviance difference is used unscaled here. For these
## gaussian fits the dispersion is estimated (see the summary output), and the
## anova(..., test = "Chisq") call below appears to scale by it, so this
## p-value will not match the anova table exactly -- confirm whether the
## statistic should be divided by the dispersion estimate.
model_dev <- model$deviance
model_df <- model$df.residual
model_2_dev <- model_2$deviance
model_2_df <- model_2$df.residual
pchisq(model_2_dev - model_dev, model_2_df - model_df, lower.tail = FALSE)
## [1] 5.273187e-67
anova(model_2, model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: Score ~ Award_Director + Favorite + Type + Rating
## Model 2: Score ~ Award_Director + Viewer + Favorite + Type + Rating
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      5940     3604.1                          
## 2      5939     3305.0  1   299.06 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Proportion of the null deviance explained by the full model (`model`):
## 1 - residual deviance / null deviance. This is the unadjusted value; it is
## not corrected for the number of predictors.
R2 <- 1-model$deviance/model$null.deviance
R2
## [1] 0.2894562

The chi-square analysis-of-deviance test comparing the two nested models yields a p-value < 2.2e-16, so the variable “Viewer” does add information to the model. Significant coefficients in the full model: (Intercept), Award_DirectorTRUE, Favorite, Viewer, TypeMusic, TypeONA, TypeOVA, RatingPG-13 - Teens 13 or older, RatingR - 17+ (violence & profanity), RatingR+ - Mild Nudity, RatingRx - Hentai.

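R-squared of the full model: 0.2895 (the R2 value computed above), so even with “Viewer” included the model explains only about 29% of the variance in anime scores. The adjusted R-squared of 0.2236 belongs to the reduced model without “Viewer”, which explains only 22.36% of the variance.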